# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
# Cufflinks wrapper on plotly
import cufflinks
# Data science imports
import pandas as pd
import numpy as np
# Options for pandas
pd.options.display.max_columns = 30
# Display all cell outputs, not only the last expression of each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
# Render plotly/cufflinks figures offline inside the notebook
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
from src.load_datasets import load_datasets, load_input_dataset

# Hold out the last three months (expressed in hourly rows) as the test set.
split_count = 3 * 31 * 24  # three months in hours

df = load_input_dataset()
train, test = df[:-split_count], df[-split_count:]
train
| timestamp | open | high | low | close | volume | |
|---|---|---|---|---|---|---|
| 0 | 1364770800000 | 93.20000 | 93.29000 | 92.90000 | 93.033000 | 116.001800 |
| 1 | 1364774400000 | 93.25000 | 100.00000 | 93.03000 | 93.100000 | 345.583889 |
| 2 | 1364778000000 | 93.37999 | 93.48797 | 93.10000 | 93.100000 | 45.243335 |
| 3 | 1364781600000 | 93.17000 | 94.00000 | 93.10999 | 93.740000 | 466.311420 |
| 4 | 1364785200000 | 93.80000 | 93.80000 | 92.49999 | 92.700020 | 96.316180 |
| ... | ... | ... | ... | ... | ... | ... |
| 59051 | 1578520800000 | 8007.30000 | 8075.50000 | 7983.00000 | 8067.249696 | 379.821083 |
| 59052 | 1578524400000 | 8067.30000 | 8101.00000 | 8018.70000 | 8036.800000 | 386.988435 |
| 59053 | 1578528000000 | 8036.80000 | 8036.80000 | 7925.00000 | 7957.300000 | 367.179571 |
| 59054 | 1578531600000 | 7957.00000 | 8028.20000 | 7951.00000 | 7995.100000 | 178.885537 |
| 59055 | 1578535200000 | 7995.10000 | 8008.60000 | 7983.00000 | 7983.000000 | 33.761173 |
59056 rows × 6 columns
# Visual sanity check of the raw OHLCV columns for each split.
train.iplot(subplots=True)
test.iplot(subplots=True)
from sktime.utils.plotting import plot_series
# Forecast target: the hourly close price of each split.
train_label = train['close']
test_label = test['close']
plot_series(train_label)
(<Figure size 1152x288 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x7fc2173a9b00>)
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
# Polynomial detrending of the close price.
# NOTE(review): the original comment said "liner" (linear), but degree=10
# fits a degree-10 polynomial trend — confirm which was intended.
forecaster = PolynomialTrendForecaster(degree=10)
transformer = Detrender(forecaster=forecaster)
# Fit the trend on the training series only, then remove it from both splits.
transformer = transformer.fit(train_label)
yt = transformer.transform(train_label)  # training residuals
yt_test = transformer.transform(test_label)  # test residuals, same fitted trend
yt
0 93.033000
1 93.100000
2 93.100000
3 93.740000
4 92.700020
...
59051 -3596.361360
59052 -3629.939681
59053 -3712.569672
59054 -3677.901031
59055 -3693.133757
Length: 59056, dtype: float64
# internally, the Detrender uses the in-sample predictions
# of the PolynomialTrendForecaster
forecaster = PolynomialTrendForecaster(degree=10)
forecaster = forecaster.fit(train_label)
# Relative in-sample horizon: 0, -1, ..., -(n-1) steps back from the cutoff.
fh_ins = -np.arange(len(train_label)) # in-sample forecasting horizon
y_pred = forecaster.predict(fh=fh_ins)
# NOTE(review): out-of-sample horizons in sktime start at 1, but
# np.arange(len(...)) starts at 0, so the first "test" prediction coincides
# with the training cutoff. This looks like the reason the comparison frame
# below needs a [:-1] trim — confirm, and consider
# np.arange(1, len(test_label) + 1) together with removing that trim.
fh_ins = np.arange(len(test_label))
y_pred_test = forecaster.predict(fh=fh_ins)
y_pred
0 1.504389e-68
1 8.421795e-33
2 2.155825e-30
3 5.524896e-29
4 5.518445e-28
...
59051 1.166361e+04
59052 1.166674e+04
59053 1.166987e+04
59054 1.167300e+04
59055 1.167613e+04
Length: 59056, dtype: float64
# Overlay raw price, fitted polynomial trend and residuals for the train split.
comparision = pd.DataFrame({
    'label': train_label,
    'trend': y_pred,
    'residuals': yt
})
# Use real timestamps (milliseconds since epoch) as the x-axis.
comparision.index = pd.to_datetime(train['timestamp'], unit='ms')
comparision.iplot()
# Same overlay for the held-out test period.
# NOTE(review): the [:-1] trim compensates for an index misalignment between
# test_label and y_pred_test (whose horizon starts at 0, not 1, above) —
# verify the lengths below really match test['timestamp'] before reindexing.
comparision = pd.DataFrame({
    'label': test_label,
    'trend': y_pred_test,
    'residuals': yt_test
})[:-1]
# Sanity check: both lengths must agree for the index assignment to succeed.
len(comparision)
len(test['timestamp'])
comparision.index = pd.to_datetime(test['timestamp'], unit='ms')
comparision.iplot()
2232
2232
from src.load_datasets import load_input_dataset
# Reload the full (un-split) dataset for whole-series detrending below.
df = load_input_dataset()
df
| timestamp | open | high | low | close | volume | |
|---|---|---|---|---|---|---|
| 0 | 1364770800000 | 93.200000 | 93.29000 | 92.90000 | 93.03300 | 116.001800 |
| 1 | 1364774400000 | 93.250000 | 100.00000 | 93.03000 | 93.10000 | 345.583889 |
| 2 | 1364778000000 | 93.379990 | 93.48797 | 93.10000 | 93.10000 | 45.243335 |
| 3 | 1364781600000 | 93.170000 | 94.00000 | 93.10999 | 93.74000 | 466.311420 |
| 4 | 1364785200000 | 93.800000 | 93.80000 | 92.49999 | 92.70002 | 96.316180 |
| ... | ... | ... | ... | ... | ... | ... |
| 61283 | 1615298400000 | 54838.000000 | 54839.00000 | 54450.00000 | 54610.00000 | 455.352289 |
| 61284 | 1615302000000 | 54610.000000 | 54610.00000 | 54002.00000 | 54171.00000 | 301.784871 |
| 61285 | 1615305600000 | 54175.000000 | 54402.00000 | 53850.00000 | 54189.40014 | 127.449982 |
| 61286 | 1615309200000 | 54188.457855 | 54321.00000 | 53867.00000 | 54007.00000 | 237.987207 |
| 61287 | 1615312800000 | 53891.000000 | 53891.00000 | 53755.00000 | 53778.00000 | 22.735552 |
61288 rows × 6 columns
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sklearn.linear_model import LogisticRegression  # NOTE(review): unused here

# Degree-10 polynomial detrending of the full close-price series.
label = df['close']
degree = 10

forecaster = PolynomialTrendForecaster(degree=degree)
transformer = Detrender(forecaster=forecaster)
# Residuals after removing the fitted polynomial trend.
yt = transformer.fit_transform(label)

# internally, the Detrender uses the in-sample predictions
# of the PolynomialTrendForecaster; reproduce them explicitly for plotting.
forecaster = PolynomialTrendForecaster(degree=degree)
fh_ins = -np.arange(len(label))  # in-sample horizon: 0, -1, ..., -(n-1)
y_pred = forecaster.fit(label).predict(fh=fh_ins)

# Overlay raw price, fitted trend and residuals on real timestamps.
comparision = pd.DataFrame({
    'label': label,
    'trend': y_pred,
    'residuals': yt
})
comparision.index = pd.to_datetime(df['timestamp'], unit='ms')
comparision.iplot()
from src.load_datasets import split_train_test
from src.prepare_datasets import get_full_features_dataset, get_scaler, normalize_dataset
df = get_full_features_dataset()
# Drop leftover 'Unnamed: *' index columns produced by CSV round-trips.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df['timestamp'] = pd.to_datetime(df['timestamp'])
df
# Plot a downsampled view (every 15th row) of the OHLCV columns.
df[['open', 'close', 'high', 'low', 'volume']][::15].iplot(subplots=True)
Will add time-related features and select only the important ones.
# pop() mutates df in place: 'close' is removed from the feature frame
# and becomes the prediction target.
labels = df.pop('close')
labels[::15].iplot()
df
# tsfresh requires an id column; a single constant id treats the whole
# frame as one time series.
df['id'] = 1
df
from tsfresh import extract_relevant_features
# Extract features and keep only the ones statistically relevant to `labels`.
direct_features = extract_relevant_features(df, labels, column_id='id', column_sort='timestamp')
direct_features
import scipy.stats as stats
import pylab
# NOTE(review): train_features is not defined in this file's visible code —
# presumably produced by an earlier cell; confirm before running.
# Close-price returns; [1:] drops the NaN that pct_change produces first.
close_change = train_features['close'].pct_change()[1:]
close_change.head()
# Q-Q plot of the returns against a normal distribution.
stats.probplot(close_change, dist='norm', plot=pylab)
import tensorflow as tf
import matplotlib.pyplot as plt

def plot_log_freaquency(series, samples_per_year=365):
    """Plot the magnitude of the real FFT of `series` on a log frequency axis.

    Parameters
    ----------
    series : real-valued 1-D array-like signal.
    samples_per_year : number of samples making up one year. Default 365
        (one sample per day) preserves the original behaviour.
        NOTE(review): the series passed below appears to be hourly, so
        365 * 24 may be the correct value here — confirm.
    """
    fft = tf.signal.rfft(series)
    # Frequency bin indices: bin k corresponds to k cycles per dataset.
    f_per_dataset = np.arange(0, len(fft))
    n_samples = len(series)
    years_per_dataset = n_samples / samples_per_year
    # Convert bin index to cycles per year.
    f_per_year = f_per_dataset / years_per_dataset
    plt.step(f_per_year, np.abs(fft))
    plt.xscale('log')
    plt.xticks([1, 365], labels=['1/Year', '1/day'])
    _ = plt.xlabel('Frequency (log scale)')

plot_log_freaquency(train_features['close'])
plot_log_freaquency(train_features['close'].diff().dropna())
plot_log_freaquency(train_features['volume'])
plot_log_freaquency(train_features['volume'].diff().dropna())
import sweetviz as sv
# Side-by-side EDA report of train vs test features, 'close' as the target.
compare_report = sv.compare([train_features, 'Train data'], [test_features, 'Test data'], "close")
compare_report.show_notebook()
# Plot only every 60th row (starting at offset 59) to keep the figures light.
train_features[59::60].iplot(subplots=True)
test_features[59::60].iplot(subplots=True)
Will use only the training mean and deviation, so the NN gets no access to the test dataset.
Divide by the max-min deviation.
# Show floats with two decimals in the describe() tables.
pd.set_option('float_format', '{:.2f}'.format)
train_features.describe()
test_features.describe()
The training maximum is too small and will not allow correct prediction of values in the testing dataset, so we will use a manually chosen maximum of 100 thousand dollars for all price columns except volume.
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Fit the scaler on training statistics only, so the network never sees
# information derived from the test set.
train_min = train_features.min()
train_max = train_features.max()

MAX_TARGET = 100000  # manually chosen price ceiling (USD)

# Cap every price column at MAX_TARGET so the scaler can represent test-set
# prices above the training maximum; 'volume' keeps its observed maximum.
for price_column in ('high', 'low', 'open', 'close'):
    train_max[price_column] = MAX_TARGET

# Two-row frame (min row, max row) is enough to define the MinMax range.
train_fit = pd.DataFrame([train_min, train_max])

scaler = MinMaxScaler()
scaler = scaler.fit(train_fit)

print("normalise train dataset...")
train_normalised = pd.DataFrame(scaler.transform(train_features))
train_normalised.columns = train_features.columns
train_normalised.index = train_features.index

print("normalise test dataset...")
test_normalised = pd.DataFrame(scaler.transform(test_features))
test_normalised.columns = test_features.columns
test_normalised.index = test_features.index

train_normalised.head()
train_normalised[59::60].iplot(subplots=True, title="Train")
test_normalised[59::60].iplot(subplots=True, title="Test")
# Compare the raw close price against its normalised counterpart on the
# same downsampled rows (every 60th row, offset 59).
train_in_hours = train_features[59::60]
feature2normaliesd = pd.DataFrame({
    'Real': train_in_hours['close'],
    'Normalised': train_normalised['close'][59::60]
})
feature2normaliesd.index = train_in_hours.index
feature2normaliesd.iplot(subplots=True)